For this assignment, we have scraped "computer" based books data.
In total, we have a dataset with 39,990 rows and 15 features, consisting of all predictive variables.
Each row in the dataset represents the details of one unique book, with attributes such as book ID, author, title, publisher, edition, etc. The features provided for each record are:
Let's proceed to the second step and apply the data wrangling steps. In this, we will read our dataset from the provided file in CSV format.
In this case, our data source is a REST API, and we will gather all the data from this single source.
# importing the necessary libraries for the analysis
import pandas as pd # for data maniulation
import numpy as np # for numerical operations
import matplotlib.pyplot as plt # for visualization
import seaborn as sns # for statistical visualization
import re # for string manipulation
from PIL import Image # for image processing
from wordcloud import WordCloud, STOPWORDS # for generating word clouds
from nltk.corpus import stopwords # for dealing with stopwords
from nltk.tokenize import word_tokenize # for tokenization
import ydata_profiling as pp # for generating pandas profiling
import warnings # to ignore warnings
warnings.filterwarnings('ignore') # ignore warnings during execution
# import the necessary libraries for the analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from kneed import KneeLocator #need to be installed before running
from PIL import Image
from IPython.display import clear_output
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ydata_profiling import ProfileReport
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import chain
# import the necessary libraries for the analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from PIL import Image
import scipy.stats as stats
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from ydata_profiling import ProfileReport
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer #pip install this please
# Load the scraped books dataset from the CSV file produced by the API scraper
csv_path = 'api_metod_40k_15_fields.csv'
df = pd.read_csv(csv_path)
# Peek at the first five records
df.head()
| Title | id | price | author | publisher | pub_year | s_tile | edition_num | description | availabe | sale_date | short_pub | num_of_author | width | height | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Learning Go | 211190367 | 71.990000 | Jon Bodner | O'Reilly Media | 2024 | NaN | (2nd ed.) | Go has rapidly become the preferred language f... | True | 2024-01-10T00:00:00 | Jan 2024 | 1 | 97 | 150 |
| 1 | Tidy First? | 211127822 | 42.990000 | Kent Beck | O'Reilly Media | 2023 | NaN | NaN | Tidying up messy software is a must. And that ... | True | 2023-10-17T00:00:00 | Oct 2023 | 1 | 97 | 150 |
| 2 | Hands-On Machine Learning with Scikit-Learn, K... | 210681725 | 96.990000 | Aurélien Géron | O'Reilly Media | 2022 | NaN | (3rd ed.) | Through a recent series of breakthroughs, deep... | True | 2022-10-04T00:00:00 | Oct 2022 | 1 | 97 | 150 |
| 3 | Designing Data-Intensive Applications | 95729334 | 67.990000 | Martin Kleppmann | O'Reilly Media | 2017 | The Big Ideas Behind Reliable, Scalable, and M... | NaN | Data is at the center of many challenges in sy... | True | 2017-03-16T00:00:00 | Mar 2017 | 1 | 97 | 150 |
| 4 | Exam Ref MS-102 Microsoft 365 Administrator | 210964419 | 53.862731 | Orin Thomas | Pearson Education | 2023 | NaN | NaN | Prepare for Microsoft Exam MS-102 and help dem... | True | 2023-10-18T00:00:00 | Oct 2023 | 1 | 97 | 150 |
# Report the dimensions (rows and columns) of the dataset
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")
print(f"Shape (rows, columns): {df.shape}")
Number of rows: 39990 Number of columns: 15 Shape (rows, columns): (39990, 15)
As we can see, our dataset has 39,990 rows and 15 columns.
Let's check our variables data types and look for corrupted data that we can filter out.
# Summary of the DataFrame: dtype and non-null count of every column
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 39990 entries, 0 to 39989 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 39990 non-null object 1 id 39990 non-null int64 2 price 39881 non-null float64 3 author 39981 non-null object 4 publisher 39990 non-null object 5 pub_year 39990 non-null int64 6 s_tile 26656 non-null object 7 edition_num 3613 non-null object 8 description 34382 non-null object 9 availabe 39990 non-null bool 10 sale_date 39990 non-null object 11 short_pub 39990 non-null object 12 num_of_author 39990 non-null int64 13 width 39990 non-null int64 14 height 39990 non-null int64 dtypes: bool(1), float64(1), int64(5), object(8) memory usage: 4.3+ MB
# Count the null/missing values in each column
df.isnull().sum()
Title 0 id 0 price 109 author 9 publisher 0 pub_year 0 s_tile 13334 edition_num 36377 description 5608 availabe 0 sale_date 0 short_pub 0 num_of_author 0 width 0 height 0 dtype: int64
There are missing values in the 'price', 's_tile', 'edition_num' and 'description' columns (plus a handful in 'author').
Now, we are going to check the unique values of each column.
# Inspect the distinct values present in every column
for column in df.columns:
    print(f"\033[94mUnique values in '{column}' column:")
    print(df[column].unique(), end='\n\n')
Unique values in 'Title' column: ['Learning Go' 'Tidy First?' 'Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow' ... 'Charting the Topic Maps Research and Applications Landscape' 'Data Mining for Biomedical Applications' 'PRICAI 2006: Trends in Artificial Intelligence'] Unique values in 'id' column: [211190367 211127822 210681725 ... 2476718 2476736 2476777] Unique values in 'price' column: [ 71.99 42.99 96.99 ... 104.5515523 12.19162646 95.04818674] Unique values in 'author' column: ['Jon Bodner' 'Kent Beck' 'Aurélien Géron' ... 'Simon D. Parsons' 'Quiang Yang' 'Jérôme Lang'] Unique values in 'publisher' column: ["O'Reilly Media" 'Pearson Education' 'McGraw Hill LLC' 'Make Community, LLC' 'Cambridge University Press' 'Elluminet Press' 'Wiley' 'BCS Learning & Development Limited' 'Pragmatic Bookshelf' 'Flatiron Books' 'Packt Publishing' 'Certification Experts, LLC' 'CRC Press' 'Vytautas Alechnavicius' 'Skill Recordings Inc' 'Princeton University Press' 'Personal Evolution' 'Apress' 'Farrar, Straus and Giroux' 'No Starch Press' 'Basic Books' 'Shermin Voshmgir' 'MIT Press' 'Little, Brown and Company' 'DK Publishing' 'Elsevier Science' 'OUP Oxford' 'Springer Berlin Heidelberg' "St. Martin's Publishing Group" 'Svyatoslav Kotusev' 'W. W. Norton & Company' 'Springer Nature Singapore' 'Penguin Publishing Group' 'Holy Macro! 
Books' 'Philip Anderson' 'Canongate Books' 'Association for Computing Machinery and Morgan & Claypool Publishers' 'XML Press' 'Rivercat Books LLC' 'Applied Maths Ltd' 'Rufus Johnston' 'SA Publishing' 'Taylor and Francis' 'World Scientific Publishing Company' 'Taylor & Francis' 'PublicAffairs' 'IT Governance Publishing' 'Edward Elgar Publishing' 'Springer International Publishing' 'IRB' 'Orion' 'Springer New York' 'McFarland & Company, Inc., Publishers' 'Peter Lang Inc., International Academic Publishers' 'Unbound' 'Springer Fachmedien Wiesbaden' 'Bloomsbury Publishing' 'The Institution of Engineering and Technology' 'Mihails Konoplovs' 'Rockport Publishers' 'Mercury Learning and Information' 'Pastor Publishing Ltd' 'ANAYA MULTIMEDIA' 'Top Notch International' 'Houndstooth Press' 'Glen Jennings' 'Payload Media, Inc.' 'SitePoint' 'Kiet Huynh' 'Orange Education Pvt Ltd' 'Computertrainerin.de' 'Manning' 'Mindview LLC' 'GitforGits' 'Matti Charlton' 'Harish Bhat' 'Endeavor Technologies Inc.' 'ArgoLong Publishing' 'Joe Grant' 'Henry Holt and Co.' 'Lakeview Research' 'NYU Press' 'Addison-Wesley Longman' 'Tsunami Productions' 'Stonesong Digital LLC' 'Ingram Publishing' 'TeeBooks' 'Harvard Business Review Press' 'Rowman & Littlefield Publishers' 'Elluminet Press Ltd' 'Knopf Doubleday Publishing Group' 'Momentum Press' 'AMACOM' 'SAS Institute' 'HarperCollins Publishers' 'Wren Investment Group, LLC' 'SQLBI Corp.' 'Matthew Smith' 'Kogan Page' 'Artech House' 'Springer London' 'Grand Central Publishing' 'Hachette Books' 'Arcturus Digital Limited' 'Emerald Publishing Limited' 'Rocky Nook' 'NOLO' 'Scott La Counte' 'Springer Nature Switzerland' 'Asghar Ghori' 'Bayview Labs, LLC' 'Keyko Pty Ltd' 'Icon Books Ltd' 'Vieweg+Teubner Verlag' 'McGraw-Hill Education' 'New Age International Pvt. Ltd., Publishers' 'Reaktion Books' 'iUniverse' 'North Atlantic Books' 'Crowood' 'Potomac Books' 'Wolfram Media, Inc.' 
'Open Road Media' 'Indiana University Press' 'Random House of Canada' 'Cornell University Press' 'New York Review Books' 'Springer US' 'University of Minnesota Press' 'Melville House' 'Amsterdam University Press' 'Austin Macauley Publishers' 'Columbia University Press' 'Hodder Education' 'Society of Photo-Optical Instrumentation Engineers (SPIE)' 'Facet Publishing' 'Royal Society of Chemistry' 'Vibrant Publishers' 'Tablo Pty Ltd' 'Fox Red Risk' 'Arun E Thomas' 'OnBelay Consulting, LLC' 'CADCIM Technologies' 'Applied Network Defense' 'The Friedman Archives' 'Pragmatic Engineer B.V' 'The Cyber Consultant' 'Systems Approach, LLC' 'SkyRocket Software' 'Indy Pub' 'Ted Padova' 'MindShare Press' 'Crown' 'Random House Publishing Group' 'Nerdy Books, LLC' 'University of Queensland Press' 'Muscle Joint Nerve' 'Polity Press' 'Marcel Dekker Inc' 'Library Association Publishing (Facet Publishing)' 'Allen & Unwin' 'SPIE ' 'Babelcube Inc.' 'Government Institutes' 'TSTC Publishing' 'Heinemann' 'Birkhäuser Boston' 'Oxford University Press' 'Chronicle Books LLC' 'Infinite Ideas' 'Melbourne University Publishing' 'EGEA Spa - Bocconi University Press' 'Little, Brown Book Group' 'Infobase Publishing' 'Lexington Books' 'Springer Netherlands' 'IOS Press' 'Jenny Stanford Publishing' 'Springer Japan' 'Apple Academic Press' 'Spektrum Akademischer Verlag' 'Atlantis Press' 'Ice Publications' 'SIGS' 'Random House' 'Transworld' 'Random House Worlds' 'Tor Publishing Group' 'Pan Macmillan' 'SAGE Publications' 'Ebury Publishing' 'Sourcebooks' 'Penguin Books Ltd' 'University of Chicago Press'] Unique values in 'pub_year' column: [2024 2023 2022 2017 2018 2021 2020 2019 2015 2013 2016 2014 2011 2006 2008 2012 2009 2007 2005 1994 2003 2010 1995 1997 2002 2000 1988 2001 1998 2004 1999 1996 1990 1753 1993 1992 1991 1980 1986 1987 2025 1989] Unique values in 's_tile' column: [nan 'The Big Ideas Behind Reliable, Scalable, and Maintainable Systems' "A Beginner's Guide to HTML, CSS, JavaScript, and Web 
Graphics" ... '9th Pacific Rim International Conference on Artificial Intelligence, Guilin, China, August 7-11, 2006, Proceedings' 'First International Conference, KSEM 2006, Guilin, China, August 5-8, 2006, Proceedings' 'Second IAPR Workshop, ANNPR 2006, Ulm, Germany, August 31-September 2, 2006, Proceedings'] Unique values in 'edition_num' column: [' (2nd ed.)' nan ' (3rd ed.)' ' (8th ed.)' ' (5th ed.)' ' (7th ed.)' ' (6th ed.)' ' (4th ed.)' ' (9th ed.)' ' (11th ed.)' ' (12th ed.)' ' (10th ed.)' ' (22nd ed.)' ' (14th ed.)' ' (13th ed.)' ' (19th ed.)' ' (18th ed.)' ' (57th ed.)' ' (15th ed.)' ' (31st ed.)' ' (16th ed.)' ' (21st ed.)'] Unique values in 'description' column: ["Go has rapidly become the preferred language for building web services. Plenty of tutorials are available to teach Go's syntax to developers with experience in other programming languages, but tutorials aren't enough. They don't teach Go's idioms, so developers end up recreating patterns that don't make sense in a Go context. This practical..." 'Tidying up messy software is a must. And that means breaking up the code to make it more readable, and using guard clauses and helping functions to make it understandable. In this practical guide, author Kent Beck, creator of Extreme Programming and pioneer of software patterns, suggests when and where you might apply tidyings in your code. ...' 'Through a recent series of breakthroughs, deep learning has boosted the entire field of machine learning. Now, even programmers who know close to nothing about this technology can use simple, efficient tools to implement programs capable of learning from data. This bestselling book uses concrete examples, minimal theory, and production-ready...' ... 'This book constitutes the refereed proceedings of the 9th Pacific Rim International Conference on Artificial Intelligence, PRICAI 2006, held in Guilin, China in August 2006. The book presents 81 revised full papers and 87 revised short papers...' 
'Here are the refereed proceedings of the First International Conference on Knowledge Science, Engineering and Management, KSEM 2006, held in Guilin, China in August 2006 in conjunction with PRICAI 2006. The book presents 51 revised full papers and...' 'This book constitutes the refereed proceedings of the Second IAPR Workshop on Artificial Neural Networks in Pattern Recognition, ANNPR 2006, held in Ulm, Germany in August/September 2006. The 26 revised papers presented were carefully reviewed and...'] Unique values in 'availabe' column: [ True False] Unique values in 'sale_date' column: ['2024-01-10T00:00:00' '2023-10-17T00:00:00' '2022-10-04T00:00:00' ... '2005-06-28T00:00:00' '2006-02-28T00:00:00' '2008-02-20T00:00:00'] Unique values in 'short_pub' column: ['Jan 2024' 'Oct 2023' 'Oct 2022' 'Mar 2017' 'Feb 2022' 'Aug 2023' 'Sep 2022' 'May 2018' 'Feb 2024' 'Jun 2022' 'Mar 2022' 'Nov 2023' 'Oct 2021' 'Aug 2022' 'Jul 2020' 'Jan 2023' 'Sep 2019' 'Jun 2021' 'Oct 2019' 'May 2020' 'Nov 2021' 'Apr 2023' 'Dec 2022' 'Sep 2015' 'May 2023' 'Sep 2021' 'Mar 2020' 'Jul 2013' 'Jul 2017' 'Jan 2020' 'Jul 2021' 'May 2022' 'Mar 2021' 'Nov 2020' 'Nov 2019' 'Sep 2016' 'Apr 2019' 'Apr 2015' 'Sep 2014' 'Nov 2022' 'Dec 2019' 'Jan 2021' 'Sep 2023' 'Oct 2011' 'Mar 2023' 'Dec 2023' 'Dec 2024' 'Dec 2020' 'Aug 2019' 'Jul 2022' 'Apr 2020' 'Dec 2011' 'May 2006' 'Feb 2021' 'Dec 2014' 'Feb 2018' 'Jul 2018' 'Feb 2023' 'Dec 2015' 'Sep 2020' 'Jan 2018' 'Oct 2017' 'Apr 2022' 'Aug 2017' 'Jun 2023' 'Feb 2020' 'Jun 2013' 'Jul 2008' 'Apr 2021' 'Mar 2016' 'Jun 2020' 'Sep 2017' 'Dec 2021' 'Jun 2019' 'Aug 2021' 'May 2008' 'Oct 2015' 'Jan 2022' 'Dec 2018' 'Aug 2012' 'Jan 2016' 'Jan 2019' 'Feb 2019' 'Aug 2020' 'Jul 2009' 'Aug 2011' 'May 2013' 'Nov 2006' 'Jun 2016' 'Aug 2007' 'Dec 2007' 'Oct 2020' 'Nov 2015' 'Sep 2013' 'Jun 2007' 'Feb 2013' 'Jan 2015' 'Jul 2023' 'Jul 2005' 'Mar 2018' 'Jan 2012' 'Jul 2019' 'Jul 2012' 'Oct 1994' 'May 2021' 'Mar 2019' 'Aug 2003' 'Aug 2006' 'Aug 2016' 'Oct 2018' 'Nov 2005' 'Nov 
2016' 'Aug 2010' 'Aug 2018' 'Dec 2013' 'Oct 2013' 'Sep 2018' 'Mar 2005' 'Nov 2014' 'Jan 2013' 'Aug 2008' 'Aug 1995' 'Apr 2005' 'Apr 2011' 'May 2005' 'Jun 2017' 'May 2019' 'May 2010' 'Apr 2018' 'Mar 1997' 'Sep 2002' 'Jan 2014' 'Feb 2010' 'Apr 2012' 'Apr 2013' 'Jan 2000' 'Feb 2012' 'Mar 1988' 'Feb 2009' 'May 2017' 'Jul 2015' 'Feb 2001' 'Jul 2016' 'Feb 2005' 'Jul 2006' 'Feb 1994' 'Apr 1998' 'Mar 2008' 'Oct 2002' 'Nov 2002' 'Dec 2016' 'Oct 2009' 'Dec 2005' 'Jun 2009' 'Mar 2014' 'Sep 2007' 'Jun 2006' 'Aug 2004' 'Oct 2010' 'Sep 2012' 'May 2016' 'Jun 2012' 'May 2004' 'Jun 2014' 'Feb 2015' 'Jun 2011' 'Mar 2007' 'Dec 2008' 'Mar 2013' 'Dec 2003' 'Jul 2003' 'Nov 2008' 'Nov 2003' 'Sep 2004' 'Mar 2006' 'Dec 2017' 'Apr 2009' 'Nov 2011' 'Nov 2018' 'Oct 2014' 'Jun 2015' 'Oct 2016' 'Jul 2014' 'Apr 2017' 'Jun 2008' 'Nov 2013' 'Feb 2011' 'Apr 2010' 'Jul 1998' 'May 2015' 'Mar 2009' 'Dec 2006' 'Nov 2010' 'Jun 2018' 'Dec 2002' 'Aug 1999' 'Oct 2005' 'Aug 2015' 'Jan 2005' 'Jan 2007' 'Apr 2007' 'Jul 2011' 'Apr 2006' 'Nov 2007' 'Sep 2000' 'Jun 2000' 'Mar 2015' 'Mar 2010' 'Aug 1997' 'Apr 2001' 'Dec 2001' 'Jul 2007' 'May 2012' 'Jan 2009' 'Aug 2014' 'May 2003' 'Feb 2004' 'Mar 2011' 'Sep 2009' 'Apr 2016' 'Mar 2004' 'Feb 2016' 'Feb 2014' 'Jun 2010' 'Dec 2009' 'Apr 2008' 'Sep 2003' 'Sep 2008' 'Nov 2012' 'Oct 1996' 'Oct 2007' 'Oct 2003' 'Jul 2010' 'Nov 2017' 'Jul 1997' 'Jul 1990' 'Sep 2006' 'Jan 1753' 'Apr 2014' 'Dec 2012' 'Sep 2011' 'Jan 2017' 'Nov 2004' 'Feb 2017' 'Jun 2004' 'Jun 2003' 'Oct 2000' 'Aug 2009' 'Oct 2001' 'Jun 2002' 'Sep 2010' 'Feb 2008' 'Oct 2008' 'Feb 2006' 'Apr 2002' 'Sep 2001' 'Oct 2006' 'Jan 2006' 'May 2014' 'Jan 2011' 'Jan 1999' 'Mar 2003' 'May 1993' 'Oct 2012' 'Oct 1998' 'Feb 2007' 'Aug 2005' 'May 2007' 'Apr 1997' 'Aug 2002' 'Dec 2010' 'Jan 2004' 'Nov 1998' 'Oct 1992' 'Jan 2002' 'May 1999' 'Jan 1995' 'Sep 1999' 'May 2009' 'Sep 2005' 'May 2011' 'Aug 2013' 'Mar 2012' 'Jan 2001' 'Nov 1999' 'Nov 2009' 'Dec 2004' 'Jan 2010' 'Jun 2005' 'Jan 2008' 'Oct 2004' 'May 2001' 'Dec 2000' 
'Mar 2000' 'Apr 2004' 'Feb 2000' 'Aug 1991' 'Jan 1998' 'Feb 2003' 'May 1997' 'Dec 1994' 'Jul 2000' 'Jun 1994' 'Jul 2002' 'Jun 1980' 'Nov 2000' 'Jan 2003' 'Jul 2004' 'Mar 2001' 'Feb 1988' 'Aug 2001' 'Feb 1998' 'Aug 1998' 'Oct 1995' 'Apr 1999' 'Jun 2001' 'Nov 2001' 'Jul 2001' 'Jan 1993' 'Jan 1997' 'Apr 2000' 'Feb 1995' 'Jul 1995' 'Feb 1996' 'Mar 2002' 'Dec 1997' 'Feb 2002' 'May 2000' 'Mar 1999' 'Nov 1996' 'Apr 1996' 'Aug 1992' 'Feb 1999' 'May 2002' 'Jun 1996' 'Dec 1998' 'May 1991' 'Mar 1986' 'Sep 1997' 'Nov 1993' 'Apr 1990' 'Jul 1992' 'Nov 1991' 'Sep 1987' 'May 1996' 'Sep 1996' 'Apr 2003' 'Dec 1999' 'Jul 1999' 'May 1998' 'Jun 1988' 'Jun 1995' 'Feb 1997' 'Jan 1988' 'Oct 1999' 'Jul 2024' 'Jul 1996' 'Jun 1990' 'Oct 1991' 'Nov 1994' 'Sep 1991' 'Sep 1994' 'Jun 2024' 'Aug 1990' 'Mar 1998' 'Apr 2024' 'Mar 2024' 'Sep 2024' 'May 2024' 'Aug 2000' 'Mar 1993' 'Jun 1999' 'Nov 1997' 'Sep 1998' 'Nov 2024' 'Jan 1996' 'Nov 1992' 'Jul 1994' 'May 1995' 'Feb 1993' 'Dec 1992' 'Dec 1991' 'Aug 2024' 'Oct 2024' 'Feb 2025' 'Jan 2025' 'Mar 1992' 'Apr 1994' 'Dec 1995' 'Jun 1997' 'Apr 1992' 'Oct 1993' 'Jun 1998' 'Oct 1988' 'Apr 1989' 'Aug 1986' 'Mar 1991' 'Nov 1990' 'Sep 1992' 'Aug 1994' 'Jan 1989' 'Aug 1993' 'May 1992' 'Mar 1994' 'Dec 1993' 'Jan 1994' 'May 1990' 'Jun 1989' 'Jan 1990' 'Dec 1990' 'Dec 1989' 'Nov 1989' 'Jan 1986' 'Aug 1989' 'Aug 1996' 'Mar 1995' 'Sep 1995' 'Nov 1995' 'Oct 1997' 'Jul 1989' 'Sep 1989' 'Jan 1991' 'Jun 1986' 'Jun 1993' 'Jun 1992'] Unique values in 'num_of_author' column: [ 1 2 3 4 7 5 6 10 9 11 8 16 0 13 17 12 14 19 24 15 20 18 59 22 30] Unique values in 'width' column: [97] Unique values in 'height' column: [150]
# Collect the variables that contain at least one missing value
na_var = [var for var in df.columns if df[var].isnull().any()]
# Percentage of missing values per such column, ordered highest first
missing_pct = df[na_var].isnull().mean().sort_values(ascending = False) * 100
missing_pct
edition_num 90.965241 s_tile 33.343336 description 14.023506 price 0.272568 author 0.022506 dtype: float64
Now we can clearly identify that the 'edition_num' (~91%) and 's_tile' (~33%) columns have a considerable proportion of missing values, while 'description', 'price' and 'author' have much smaller amounts.
Let's drop the 's_tile' column to filter out the corrupted data, as it won't add any value to our analysis.
Now we can clearly identify that the 'edition_num', 's_title', 'description', 'price' and 'author' columns have a considerable amount of missing values.
Let's drop 's_title' column to filter out the corrupted data as it won't add any value to our data.
Handling NaNs:
# Handling missing data, column by column:
# price: impute the few missing prices with the column mean
df['price'] = df['price'].fillna(df['price'].mean())
# author: drop the handful of rows that have no author at all
df = df.dropna(subset=['author'])
# s_tile: drop the column entirely (heavily incomplete, adds no value)
df = df.drop(columns=['s_tile'])
# Re-check the per-column null counts after the imputation/drops above
df.isnull().sum()
Title 0 id 0 price 0 author 0 publisher 0 pub_year 0 edition_num 36368 description 5600 availabe 0 sale_date 0 short_pub 0 num_of_author 0 width 0 height 0 dtype: int64
The total number of missing cells remaining in the dataset:
# Grand total of missing cells left in the whole DataFrame
df.isnull().sum().sum()
41968
# Count the fully duplicated rows in the dataset
print("Number of duplicated rows:", df.duplicated().sum())
Number of duplicated rows: 9153
# Preview a few duplicated rows (duplicated() marks later occurrences only)
df[df.duplicated()].head()
| Title | id | price | author | publisher | pub_year | edition_num | description | availabe | sale_date | short_pub | num_of_author | width | height | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30 | Getting Started with Processing | 2208258 | 14.99000 | Casey Reas | Make Community, LLC | 2015 | (2nd ed.) | Processing opened up the world of programming ... | True | 2015-09-09T00:00:00 | Sep 2015 | 2 | 97 | 150 |
| 60 | REST API Design Rulebook | 801433 | 10.99000 | Mark Masse | O'Reilly Media | 2011 | NaN | In today’s market, where rival web services co... | True | 2011-10-18T00:00:00 | Oct 2011 | 1 | 97 | 150 |
| 90 | (ISC)2 CISSP Certified Information Systems Sec... | 210320755 | 50.00000 | Mike Chapple | Wiley | 2021 | (3rd ed.) | Full-length practice tests covering all CISSP ... | True | 2021-06-16T00:00:00 | Jun 2021 | 2 | 97 | 150 |
| 140 | Fluent C | 210689032 | 71.99000 | Christopher Preschern | O'Reilly Media | 2022 | NaN | Expert advice on C programming is hard to find... | True | 2022-10-17T00:00:00 | Oct 2022 | 1 | 97 | 150 |
| 180 | CISSP All-in-One Exam Guide, Ninth Edition | 210393469 | 94.28335 | Fernando Maymi | McGraw Hill LLC | 2021 | (9th ed.) | A new edition of Shon Harris’ bestselling exam... | True | 2021-11-12T00:00:00 | Nov 2021 | 2 | 97 | 150 |
# Remove exact duplicate rows, keeping only the first occurrence of each
df = df.drop_duplicates(keep='first')
# Re-check the dataset shape after deduplication
print(f'\033[94mNumber of records (rows) in the dataset are: {df.shape[0]}')
print(f'\033[94mNumber of features (columns) in the dataset are: {df.shape[1]}')
Number of records (rows) in the dataset are: 30828 Number of features (columns) in the dataset are: 14
We can see that we have eliminated 9,153 duplicated rows (the row count dropped from 39,981 to 30,828).
Now, let's identify our numerical and categorical variables so we can perform further text preprocessing upon them.
# List all column names of the cleaned dataset
df.columns
Index(['Title', 'id', 'price', 'author', 'publisher', 'pub_year',
'edition_num', 'description', 'availabe', 'sale_date', 'short_pub',
'num_of_author', 'width', 'height'],
dtype='object')
# Partition the features by dtype: object ('O') columns are treated as
# categorical, everything else as numerical.
num_columns = []
cat_columns = []
for col in df.columns:
    if df[col].dtypes == 'O':
        cat_columns.append(col)
    else:
        num_columns.append(col)
print("Number of Numerical Features: ", len(num_columns))
print("Numerical Features: ", end = '')
print(num_columns)
print("\nNumber of Categorical Features: ", len(cat_columns))
print("Categorical Features: ", end = '')
print(cat_columns)
Number of Numerical Features: 7 Numerical Features: ['id', 'price', 'pub_year', 'availabe', 'num_of_author', 'width', 'height'] Number of Categorical Features: 7 Categorical Features: ['Title', 'author', 'publisher', 'edition_num', 'description', 'sale_date', 'short_pub']
Function to handle emoji classification (if there is any in text)
def emoji(title_data):
    """Replace ASCII emoticons in a string with sentiment placeholder tokens.

    Positive emoticons become ' positiveemoji ' and negative ones become
    ' negetiveemoji ' (token spelling preserved as used downstream).
    """
    # Ordered (pattern, token) pairs: smiles, laughs, love, winks, frowns, cries
    emoticon_map = [
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O)', ' positiveemoji '),
        (r'(:\s?D|:-D|x-?D|X-?D)', ' positiveemoji '),
        (r'(<3|:\*)', ' positiveemoji '),
        (r'(;-?\)|;-?D|\(-?;|@-\))', ' positiveemoji '),
        (r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negetiveemoji '),
        (r'(:,\(|:\'\(|:"\()', ' negetiveemoji '),
    ]
    for pattern, token in emoticon_map:
        title_data = re.sub(pattern, token, title_data)
    return title_data
Function to clean the 'title' and 'description' column
def process_title(title_data):
    """Normalize a raw title/description string for text analysis.

    Lowercases; strips usernames, URLs, digits, quotes and punctuation; maps
    emoticons to sentiment tokens via emoji(); caps character repetitions at
    two; collapses whitespace. Float inputs (e.g. NaN cells) are rendered as a
    two-decimal string first so the regex pipeline always receives a str.
    """
    if isinstance(title_data, float):
        title_data = "{:.2f}".format(title_data)  # adjust the precision as needed
    else:
        title_data = str(title_data)
    title_data = title_data.lower()                                              # lowercase
    title_data = re.sub(r'@[^\s]+', '', title_data)                              # remove usernames
    title_data = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', title_data)   # remove URLs
    title_data = re.sub(r"\d+", " ", title_data)                                 # remove all digits
    title_data = re.sub('"', " ", title_data)                                    # remove double quotes
    title_data = emoji(title_data)                                               # replace emoticons
    title_data = re.sub(r"\b[a-zA-Z]\b", "", title_data)                         # remove single characters
    title_data = re.sub(r"[^\w\s]", " ", title_data)                             # remove punctuation
    title_data = re.sub(r'(.)\1+', r'\1\1', title_data)                          # cap letter repeats at 2
    title_data = re.sub(r"\s+", " ", title_data)                                 # collapse whitespace
    return title_data
Function to check if the spelling is correct or not
def remove_meaningless(df, col):
    """Spell-correct every value in df[col] in place using TextBlob.

    NOTE(review): `TextBlob` is not imported anywhere in this file, so calling
    this function raises NameError as written — confirm whether the
    `from textblob import TextBlob` import (third-party) was dropped, or
    whether this helper is intentionally unused.
    """
    df[col] = df[col].apply(lambda x: TextBlob(x).correct())
Function to check stopwords as well as remove them
# NLTK's standard English stop-word list
s_words = list(stopwords.words('english'))
from collections import Counter
c = Counter()  # NOTE(review): unused here — kept in case a later cell relies on it
# Extra hand-picked stop words (fillers, contractions, review jargon).
# Fix: '" just"' had a stray leading space in the original, so 'just' was
# never actually filtered out; the list also contains a few duplicates
# ("us", 'must', 'is'), which are harmless for membership tests.
extra_s = ["what", "us", "this", "well", "there", "much", "us", "and", "you're", "in", "where", "when", "just", "how", "is",
           "ha", "re", "are", "hi", "aren't", 'couldn', 'could', 'couldnt', "couldn't", 'did', 'had', 'have', 'must', 'does',
           'should', 'was', "it's", "didn't", "doesn't", "don't", "hadn't", "hasn't", "haven't", "isn't", 'let', 'll', "may",
           'were', 'is', 'has', 'must', 'mustn', 'rt', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn', 'realli', 'now',
           'got', 'man', 'people', 'a', 'becaus', 'caus', "one", "im", "guy", "someone", "two", 'read', "nearby", "i", "he's",
           "she's", "we", "it", "they", "wouldn’t", "i've", 'aren', 'didn', 'doesn', 'don', 'hadn', 'hasn', 'star', 'haven', 'isn', 'great', 'subscription', 'sdidn', 've']
# Combined stop-word list (kept as a list for compatibility with any later use)
stop_words = list(STOPWORDS) + list(s_words) + list(extra_s)
# Set copy for O(1) membership checks in the hot per-word loop below
_stop_words_set = set(stop_words)

def remove_stopwords(df, column):
    """Remove stop words (case-insensitively) from every string in df[column], in place."""
    df[column] = df[column].apply(
        lambda x: ' '.join(word for word in x.split() if word.lower() not in _stop_words_set)
    )
# Clean the raw book titles with the text-preprocessing helper
title_cleaner = np.vectorize(process_title)
df['processed_title'] = title_cleaner(df['Title'])
df.head(2)
| Title | id | price | author | publisher | pub_year | edition_num | description | availabe | sale_date | short_pub | num_of_author | width | height | processed_title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Learning Go | 211190367 | 71.99 | Jon Bodner | O'Reilly Media | 2024 | (2nd ed.) | Go has rapidly become the preferred language f... | True | 2024-01-10T00:00:00 | Jan 2024 | 1 | 97 | 150 | learning go |
| 1 | Tidy First? | 211127822 | 42.99 | Kent Beck | O'Reilly Media | 2023 | NaN | Tidying up messy software is a must. And that ... | True | 2023-10-17T00:00:00 | Oct 2023 | 1 | 97 | 150 | tidy first |
# Clean the descriptions, then strip stop words from the cleaned text
desc_cleaner = np.vectorize(process_title)
df['processed_description'] = desc_cleaner(df['description'])
remove_stopwords(df=df, column='processed_description')
df.head(2)
| Title | id | price | author | publisher | pub_year | edition_num | description | availabe | sale_date | short_pub | num_of_author | width | height | processed_title | processed_description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Learning Go | 211190367 | 71.99 | Jon Bodner | O'Reilly Media | 2024 | (2nd ed.) | Go has rapidly become the preferred language f... | True | 2024-01-10T00:00:00 | Jan 2024 | 1 | 97 | 150 | learning go | go rapidly become preferred language building ... |
| 1 | Tidy First? | 211127822 | 42.99 | Kent Beck | O'Reilly Media | 2023 | NaN | Tidying up messy software is a must. And that ... | True | 2023-10-17T00:00:00 | Oct 2023 | 1 | 97 | 150 | tidy first | tidying messy software means breaking code mak... |
# Normalize 'short_pub' by turning hyphens into spaces
normalized_short_pub = df['short_pub'].str.replace('-', ' ')
df['short_pub'] = normalized_short_pub
df.head(2)
| Title | id | price | author | publisher | pub_year | edition_num | description | availabe | sale_date | short_pub | num_of_author | width | height | processed_title | processed_description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Learning Go | 211190367 | 71.99 | Jon Bodner | O'Reilly Media | 2024 | (2nd ed.) | Go has rapidly become the preferred language f... | True | 2024-01-10T00:00:00 | Jan 2024 | 1 | 97 | 150 | learning go | go rapidly become preferred language building ... |
| 1 | Tidy First? | 211127822 | 42.99 | Kent Beck | O'Reilly Media | 2023 | NaN | Tidying up messy software is a must. And that ... | True | 2023-10-17T00:00:00 | Oct 2023 | 1 | 97 | 150 | tidy first | tidying messy software means breaking code mak... |
# Define the bin edges for the price ranges (in dollars)
bins = [0, 100, 500, 1000, 1500, 2000, 2500]
# Human-readable labels for the bins
labels = ['0-100', '101-500', '501-1000', '1001-1500', '1501-2000', '2001-2500']
# Bucket the prices. Use right-closed intervals with the lowest edge included,
# so the intervals [0, 100], (100, 500], ... actually match the labels.
# (The original right=False made '0-100' mean [0, 100) — so a price of exactly
# 100 landed in '0-100'... actually in '101-500' — and a price of exactly 2500
# would have been silently dropped to NaN.)
df['price_range'] = pd.cut(df['price'], bins = bins, labels = labels,
                           right = True, include_lowest = True)
df.price_range.value_counts().sort_values()
2001-2500 1 1001-1500 2 1501-2000 2 501-1000 42 101-500 7206 0-100 23575 Name: price_range, dtype: int64
# Parse 'sale_date' from ISO strings into proper pandas datetimes
df['sale_date'] = pd.to_datetime(df['sale_date'])
# Split the sale date into month / day / year components for easier analysis
# (the sampled timestamps are all midnight, so the time-of-day components
# carry no information and are not extracted)
df['month_of_sale'] = df['sale_date'].dt.month
df['date_of_sale'] = df['sale_date'].dt.day
df['year_of_sale'] = df['sale_date'].dt.year
df.head(2)
| Title | id | price | author | publisher | pub_year | edition_num | description | availabe | sale_date | short_pub | num_of_author | width | height | processed_title | processed_description | price_range | month_of_sale | date_of_sale | year_of_sale | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Learning Go | 211190367 | 71.99 | Jon Bodner | O'Reilly Media | 2024 | (2nd ed.) | Go has rapidly become the preferred language f... | True | 2024-01-10 | Jan 2024 | 1 | 97 | 150 | learning go | go rapidly become preferred language building ... | 0-100 | 1 | 10 | 2024 |
| 1 | Tidy First? | 211127822 | 42.99 | Kent Beck | O'Reilly Media | 2023 | NaN | Tidying up messy software is a must. And that ... | True | 2023-10-17 | Oct 2023 | 1 | 97 | 150 | tidy first | tidying messy software means breaking code mak... | 0-100 | 10 | 17 | 2023 |
# Keep a copy of the price rounded to exactly two decimal places
df['updated_price'] = df['price'].round(decimals=2)
df.head(2)
| Title | id | price | author | publisher | pub_year | edition_num | description | availabe | sale_date | ... | num_of_author | width | height | processed_title | processed_description | price_range | month_of_sale | date_of_sale | year_of_sale | updated_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Learning Go | 211190367 | 71.99 | Jon Bodner | O'Reilly Media | 2024 | (2nd ed.) | Go has rapidly become the preferred language f... | True | 2024-01-10 | ... | 1 | 97 | 150 | learning go | go rapidly become preferred language building ... | 0-100 | 1 | 10 | 2024 | 71.99 |
| 1 | Tidy First? | 211127822 | 42.99 | Kent Beck | O'Reilly Media | 2023 | NaN | Tidying up messy software is a must. And that ... | True | 2023-10-17 | ... | 1 | 97 | 150 | tidy first | tidying messy software means breaking code mak... | 0-100 | 10 | 17 | 2023 | 42.99 |
2 rows × 21 columns
# Derive an 'edition' column holding just the numeric edition, pulled out of
# strings like ' (2nd ed.)'; rows without an edition string get NaN
edition_pattern = r'(\d+)'
df['edition'] = df['edition_num'].str.extract(edition_pattern)
df.head(2)
| Title | id | price | author | publisher | pub_year | edition_num | description | availabe | sale_date | ... | width | height | processed_title | processed_description | price_range | month_of_sale | date_of_sale | year_of_sale | updated_price | edition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Learning Go | 211190367 | 71.99 | Jon Bodner | O'Reilly Media | 2024 | (2nd ed.) | Go has rapidly become the preferred language f... | True | 2024-01-10 | ... | 97 | 150 | learning go | go rapidly become preferred language building ... | 0-100 | 1 | 10 | 2024 | 71.99 | 2 |
| 1 | Tidy First? | 211127822 | 42.99 | Kent Beck | O'Reilly Media | 2023 | NaN | Tidying up messy software is a must. And that ... | True | 2023-10-17 | ... | 97 | 150 | tidy first | tidying messy software means breaking code mak... | 0-100 | 10 | 17 | 2023 | 42.99 | NaN |
2 rows × 22 columns
Creating an affordability column, based on subjective judgement.
# Summary statistics (count, mean, std, quartiles, min/max) of the rounded price
df.updated_price.describe()
count 30828.000000 mean 87.497879 std 69.133299 min 0.000000 25% 49.990000 50% 72.640000 75% 94.440000 max 2353.370000 Name: updated_price, dtype: float64
- Most of the books price falls under 95 dollars, (75 percentile)
- 95 percent of the books fall under the price of 270 dollars
- max value is 2353 dollars.
- average book costs around 87 dollars
# Frequency of each price bucket, most common first
df.price_range.value_counts().sort_values(ascending=False)
0-100 23575 101-500 7206 501-1000 42 1001-1500 2 1501-2000 2 2001-2500 1 Name: price_range, dtype: int64
- Judging by the frequency, the 60–80 dollar price range is the most common.
- Books in the 300–600 dollar range and above are comparatively rare.
- Note that these ranges are scaled to match the frequencies: there are books above the 1,000 dollar mark, but only very few of them (a count of 1–2 books).
# Build a categorical feature describing the affordability of a book, based on
# the mean and frequency of prices. Buckets:
#   'Low price'     : price <= $60
#   'Average price' : $60 < price <= $270
#   'Over priced'   : price > $270
def _affordability(price):
    """Map a raw price to its affordability bucket (None for missing prices)."""
    if pd.isna(price):
        # price was mean-imputed earlier, so this branch should never fire
        return None
    if price <= 60:
        return 'Low price'
    if price <= 270:
        return 'Average price'
    return 'Over priced'

df['affordability'] = df['price'].apply(_affordability)
# Check for nulls in the new feature; price was already mean-imputed above,
# so zero missing values are expected here
df['affordability'].isnull().sum()
0
# Distribution of books across the affordability categories
df['affordability'].value_counts().sort_values(ascending=False)
Average price 18608 Low price 11317 Over priced 903 Name: affordability, dtype: int64
# Visualize the affordability distribution as an annotated bar chart
plt.figure(figsize=(10, 6))
x = df['affordability'].value_counts().sort_values(ascending=False)
ax = sns.barplot(x=x.index, y=x.values, palette='Blues')
# Write each bar's height just above the bar
for patch in ax.patches:
    plt.annotate(str(patch.get_height()),
                 xy=(patch.get_x() + patch.get_width() / 2, patch.get_height()),
                 xytext=(0, 1.25),
                 textcoords="offset points",
                 ha='center', va='bottom')
plt.title('Count of affordability column')
plt.xlabel('Affordability')
plt.ylabel('Count of Books')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Why a new col 'affordability'?
#df.to_csv('preprocessed_data_40k.csv',index_label=False)
Let's explore the variables and find how they are distributed across our dataset.
First, we will plot the 'Book's Publication Distribution' per 'Year'.
# Tally how many books were published in each year, ordered chronologically.
yr_counts = df['pub_year'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
plt.bar(yr_counts.index, yr_counts.values, color='skyblue')
plt.title('Publication Year Distribution')
plt.xlabel('Year')
plt.ylabel('Number of Publications')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Show the raw per-year counts below the chart.
yr_counts
1753 438 1980 1 1986 4 1987 1 1988 6 1989 9 1990 12 1991 15 1992 12 1993 22 1994 22 1995 31 1996 29 1997 39 1998 55 1999 67 2000 103 2001 143 2002 256 2003 324 2004 345 2005 452 2006 677 2007 647 2008 710 2009 749 2010 882 2011 973 2012 1315 2013 1889 2014 1760 2015 1902 2016 2080 2017 2232 2018 2284 2019 2047 2020 2065 2021 2058 2022 2350 2023 1485 2024 316 2025 21 Name: pub_year, dtype: int64
# Seaborn countplot of publications per year (compact version).
plt.figure(figsize=(12, 6))
year_ax = sns.countplot(x='pub_year', data=df)
year_ax.set_title('Publication Year Distribution (1753-2024)')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Publications')
year_ax.set_xticklabels(year_ax.get_xticklabels(), rotation=45, ha='right')
sns.despine()
# Larger version of the yearly distribution, annotated with per-bar counts.
plt.figure(figsize=(18, 7))
b_plot = sns.countplot(x='pub_year', data=df)
b_plot.set_title('Publication Year Distribution (1753-2024)', fontdict={'fontsize': 22})
b_plot.set_xlabel('Year', fontdict={'fontsize': 20})
b_plot.set_ylabel('Number of Publications', fontdict={'fontsize': 16})
b_plot.set_xticklabels(b_plot.get_xticklabels(), rotation=45, ha='right')
sns.despine()
# Write each bar's count just above its top edge.
for patch in b_plot.patches:
    b_plot.annotate(f'{patch.get_height()}',
                    (patch.get_x() + patch.get_width() / 2., patch.get_height()),
                    ha='center', va='center', xytext=(0, 3),
                    textcoords='offset points')
According to the publication year data, the number of books published rises sharply from the early 2000s and peaks between 2012 and 2023 (the 438 books dated 1753 look like a placeholder value worth verifying).
Now, we will plot the 'number of books' belonging to the particular 'price' ranges
# Materialize the cleaned price column as a plain Python list for inspection.
df['updated_price'].tolist()
[71.99, 42.99, 96.99, 67.99, 53.86, 80.81, 53.86, 42.99, 67.99, 84.99, 84.99, 71.99, 84.99, 74.99, 84.99, 71.99, 84.99, 84.99, 71.99, 84.99, 58.99, 71.99, 84.99, 49.99, 84.99, 59.99, 84.99, 74.99, 84.99, 14.99, 79.99, 84.99, 84.99, 71.99, 56.99, 76.91, 71.99, 71.99, 27.99, 6.99, 59.99, 84.99, 84.99, 71.99, 84.99, 59.99, 52.07, 67.99, 59.99, 96.99, 71.99, 57.13, 84.99, 67.99, 71.99, 59.99, 63.99, 63.99, 10.99, 64.09, 49.99, 71.99, 71.99, 71.99, 16.99, 71.99, 67.35, 59.99, 71.99, 84.99, 71.99, 74.99, 84.99, 59.99, 59.99, 71.99, 71.99, 71.99, 71.99, 71.99, 71.99, 80.99, 48.48, 48.48, 59.99, 11.99, 71.99, 50.0, 67.99, 71.99, 76.0, 74.99, 71.99, 63.99, 49.99, 71.99, 59.99, 71.99, 98.55, 71.99, 45.99, 42.99, 64.64, 59.99, 71.99, 71.99, 49.99, 71.99, 47.99, 80.81, 84.99, 56.99, 71.99, 13.99, 102.01, 32.99, 84.99, 71.99, 35.99, 71.99, 71.99, 71.99, 84.99, 96.99, 59.99, 66.0, 53.86, 59.99, 71.99, 63.99, 88.0, 71.99, 80.81, 71.99, 71.99, 71.99, 71.99, 41.74, 84.99, 71.99, 79.99, 49.99, 6.99, 71.99, 84.99, 38.99, 23.99, 84.99, 32.99, 71.99, 84.99, 71.99, 84.99, 71.99, 12.99, 84.99, 49.99, 71.99, 71.99, 59.99, 59.99, 71.99, 59.99, 71.99, 71.99, 71.99, 53.99, 84.99, 71.99, 72.73, 67.99, 96.98, 58.99, 25.99, 78.0, 94.28, 106.99, 43.09, 63.99, 55.99, 84.99, 84.99, 71.99, 96.99, 71.99, 67.99, 16.99, 53.86, 71.99, 55.99, 79.99, 72.73, 35.99, 79.99, 59.99, 78.0, 27.99, 67.99, 24.99, 71.99, 59.99, 40.99, 44.99, 71.99, 63.99, 71.99, 39.99, 35.99, 35.99, 83.32, 54.0, 55.99, 52.99, 27.99, 29.99, 68.1, 84.99, 43.09, 43.09, 71.99, 88.13, 62.0, 63.99, 96.99, 84.99, 59.99, 50.99, 51.99, 39.99, 60.0, 65.99, 74.07, 71.99, 50.99, 5.99, 20.99, 71.99, 40.39, 53.99, 63.99, 53.99, 50.99, 45.99, 71.96, 48.48, 84.99, 71.99, 84.99, 56.99, 44.99, 53.86, 56.99, 71.99, 32.99, 53.86, 71.99, 64.27, 59.99, 71.99, 79.99, 96.99, 71.99, 84.99, 76.11, 71.99, 67.99, 64.64, 48.99, 71.99, 71.99, 84.99, 71.99, 96.99, 96.99, 53.86, 84.99, 54.99, 53.99, 71.99, 84.99, 71.99, 80.8, 71.99, 84.99, 67.99, 72.0, 71.99, 
44.99, 84.99, 59.99, 71.99, 40.99, 96.99, 71.99, 71.99, 49.99, 67.99, 71.99, 84.86, 40.99, 39.99, 44.99, 23.99, 84.99, 67.99, 84.99, 53.99, 35.99, 84.99, 84.99, 76.0, 44.99, 59.99, 14.99, 71.99, 48.48, 56.07, 80.81, 71.99, 47.99, 79.99, 62.0, 25.99, 84.99, 63.99, 29.99, 84.99, 84.99, 34.99, 51.99, 17.99, 43.09, 59.99, 71.99, 60.99, 43.09, 54.99, 15.99, 49.99, 68.1, 72.73, 84.99, 84.99, 38.99, 48.99, 53.66, 47.13, 35.99, 71.99, 46.99, 71.99, 59.99, 31.99, 67.99, 44.99, 69.7, 20.99, 59.25, 32.99, 53.86, 84.99, 84.99, 52.99, 32.66, 67.99, 43.95, 71.96, 56.99, 73.99, 25.99, 59.99, 71.99, 40.39, 84.99, 71.99, 85.54, 64.99, 84.99, 89.99, 15.99, 56.99, 26.99, 71.99, 45.99, 53.99, 96.91, 33.95, 53.99, 63.82, 59.99, 58.99, 29.99, 71.99, 59.99, 46.95, 72.72, 56.99, 31.99, 35.99, 31.99, 66.67, 48.99, 68.99, 29.99, 87.44, 53.86, 71.99, 71.99, 71.99, 67.99, 75.41, 58.99, 83.3, 29.99, 53.86, 96.96, 59.99, 71.99, 71.99, 53.99, 66.49, 84.99, 48.99, 55.99, 27.99, 49.99, 56.99, 49.99, 26.99, 47.99, 54.99, 62.99, 84.99, 64.64, 84.99, 67.99, 32.99, 47.99, 44.99, 47.99, 56.99, 44.99, 7.99, 104.99, 76.0, 10.99, 56.99, 68.1, 53.99, 94.28, 48.49, 71.99, 59.99, 106.39, 71.96, 45.99, 54.99, 22.99, 37.99, 50.0, 19.83, 29.99, 90.92, 64.64, 33.99, 29.99, 16.99, 35.99, 74.07, 56.95, 72.73, 7.99, 27.99, 32.31, 31.99, 25.99, 68.1, 76.11, 53.99, 80.81, 71.99, 75.41, 71.99, 84.99, 45.99, 50.99, 50.99, 63.82, 60.61, 96.99, 72.64, 12.99, 67.99, 56.99, 55.99, 44.99, 31.99, 56.99, 13.99, 35.99, 52.99, 36.95, 35.99, 62.99, 43.99, 106.57, 36.0, 72.73, 80.81, 80.81, 16.99, 37.7, 70.03, 84.99, 71.99, 45.99, 40.99, 71.99, 71.99, 53.99, 40.99, 50.99, 50.99, 71.9, 84.93, 153.86, 27.99, 72.73, 34.99, 52.99, 94.0, 60.61, 48.48, 53.86, 25.99, 44.99, 62.99, 17.99, 25.99, 35.99, 60.91, 83.3, 39.22, 33.99, 44.99, 68.1, 40.04, 146.95, 60.0, 47.99, 74.07, 71.99, 71.99, 84.99, 56.99, 26.99, 71.99, 25.99, 16.99, 21.95, 17.99, 47.99, 55.99, 27.99, 33.41, 69.7, 7.99, 50.91, 70.0, 72.73, 58.99, 94.28, 64.64, 80.8, 49.99, 
26.99, 20.99, 35.99, 22.99, 27.99, 25.99, 31.99, 31.99, 34.99, 59.25, 45.99, 35.99, 36.99, 49.99, 14.99, 40.84, 98.55, 89.0, 65.99, 53.88, 72.73, 67.35, 70.03, 31.99, 71.99, 23.99, 29.99, 71.99, 47.46, 141.12, 44.99, 34.99, 42.99, 40.99, 19.95, 22.99, 19.99, 13.99, 130.62, 83.32, 19.95, 69.7, 43.99, 53.86, 72.0, 40.99, 72.73, 64.09, 47.28, 59.99, 27.99, 35.99, 92.14, 72.73, 35.99, 47.99, 29.99, 49.99, 49.99, 37.7, 43.09, 51.99, 39.99, 15.99, 44.99, 44.99, 76.5, 33.99, 46.95, 64.64, 35.99, 11.99, 40.99, 33.99, 49.99, 55.99, 85.54, 19.96, 4.23, 80.63, 25.99, 76.11, 60.88, 77.99, 47.95, 71.99, 71.99, 80.81, 67.35, 48.49, 66.67, 74.08, 54.55, 84.99, 64.64, 31.99, 31.99, 84.99, 39.99, 33.99, 35.99, 83.3, 195.54, 107.7, 106.39, 34.0, 49.99, 49.99, 62.99, 47.99, 40.99, 40.99, 55.99, 71.96, 71.96, 36.65, 85.54, 71.96, 77.39, 77.39, 97.99, 54.99, 62.99, 52.99, 19.99, 63.99, 35.7, 3.99, 8.99, 11.99, 8.99, 91.58, 14.95, 60.61, 47.99, 75.31, 38.79, 18.95, 41.22, 80.75, 57.67, 34.99, 26.99, 71.4, 31.99, 17.99, 62.9, 67.99, 84.84, 67.33, 56.99, 40.99, 42.99, 44.99, 35.99, 49.99, 74.08, 50.84, 16.95, 78.52, 96.15, 32.99, 88.0, 59.99, 40.99, 53.86, 67.99, 31.99, 71.4, 50.99, 40.99, 74.07, 39.99, 39.99, 35.99, 26.99, 40.99, 49.99, 26.99, 27.99, 16.99, 61.2, 88.4, 49.99, 39.99, 49.99, 15.99, 63.82, 50.23, 72.64, 27.99, 15.99, 50.79, 78.52, 60.88, 116.99, 46.45, 83.32, 60.88, 256.95, 347.95, 50.0, 155.99, 49.99, 83.99, 138.95, 150.95, 36.99, 37.99, 234.0, 80.81, 72.73, 60.61, 35.99, 84.99, 40.99, 43.09, 53.86, 37.7, 43.09, 53.86, 70.03, 53.86, 53.86, 32.31, 96.98, 64.64, 20.99, 71.99, 84.99, 53.99, 23.99, 71.99, 49.99, 12.95, 25.99, 32.99, 13.99, 71.99, 39.99, 35.99, 34.99, 49.99, 42.51, 9.9, 81.62, 35.99, 81.62, 76.5, 45.99, 50.99, 42.99, 46.99, 50.99, 55.99, 53.99, 50.99, 49.99, 42.99, 52.99, 33.99, 55.99, 49.99, 44.99, 49.99, 49.99, 44.99, 66.99, 47.99, 40.99, 71.96, 85.54, 71.96, 77.39, 63.82, 85.54, 54.3, 77.39, 90.98, 85.54, 63.82, 85.54, 36.65, 77.39, 54.3, 77.39, 71.96, 
77.39, 77.39, 77.39, 71.96, 71.96, 85.54, 72.64, 36.31, 116.23, 232.48, 87.17, 58.11, 217.95, 63.82, 174.99, 45.99, 109.99, 39.99, 5.37, 7.99, 79.91, 87.17, 26.92, 52.99, 52.95, 52.99, 87.44, 32.99, 59.99, 56.99, 57.99, 52.99, 19.99, 44.99, 30.99, 29.05, 203.42, 18.84, 27.12, 31.23, 82.75, 204.05, 6.99, 53.0, 67.33, 7.99, 48.42, 3.99, 80.8, 80.75, 87.44, 9.99, 13.99, 87.44, 87.44, 12.99, 50.99, 24.99, 17.44, 6.99, 37.99, 37.99, 4.99, 46.99, 41.99, 6.99, 52.99, 54.99, 9.95, 13.99, 50.99, 61.99, 6.99, 45.99, 11.99, 84.99, 43.95, 64.27, 53.99, 71.99, 71.99, 67.99, 72.73, 64.64, 58.99, 48.99, 32.31, 71.99, 43.09, 43.09, 72.0, 72.0, 80.81, 96.0, 56.99, 43.09, 48.48, 43.09, 71.99, 48.48, 48.48, 59.25, 84.99, 27.99, 84.0, 57.13, 37.7, 43.09, 43.09, 48.48, 59.99, 64.64, 59.25, 59.99, 37.71, 84.99, 84.99, 59.99, 56.99, 67.35, 84.99, 84.99, 33.35, 71.99, 69.0, 194.99, 70.03, 50.99, 59.99, 53.86, 53.86, 43.09, 81.72, 31.95, 71.99, 48.48, 64.64, 65.38, ...]
# Inspect the price extremes to guide the bin edges chosen below.
prices = df['updated_price']
print(min(prices))
print(max(prices))
0.0 2353.37
# Price-range histogram, pass 1: six coarse bins covering 0-2500 $.
pr_bins = [0, 100, 500, 1000, 1500, 2000, 2500]
pr_labels = ['0-100', '101-500', '501-1000', '1001-1500', '1501-2000', '2001-2500']
# right=False makes bins left-closed: a price of exactly 100 lands in '101-500'.
# NOTE(review): bins the 'price' column although min/max above were taken from
# 'updated_price' — confirm the two columns match.
df['price_range'] = pd.cut(df['price'], bins=pr_bins, labels=pr_labels, right=False)
pr_count = df['price_range'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
bars = plt.bar(pr_count.index, pr_count.values, color='#40E0D6')
plt.title('Price Range Distribution', fontdict={'fontsize': 20})
plt.xlabel('Price Range', fontdict={'fontsize': 18})
plt.ylabel('Number of Products', fontdict={'fontsize': 16})
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Annotate each bar with its count.
for rect in bars:
    plt.text(rect.get_x() + rect.get_width() / 2, rect.get_height(),
             int(rect.get_height()), va='bottom', ha='center')
plt.tight_layout()
plt.show()
# Price-range histogram, pass 2: finer bins where most prices concentrate (< 100 $).
pr_bins = [0, 20, 40, 60, 80, 100, 300, 600]
pr_labels = ['0-20', '20-40', '40-60', '60-80', '80-100', '100-300', '300-600']
df['price_range'] = pd.cut(df['price'], bins=pr_bins, labels=pr_labels, right=False)
pr_counts = df['price_range'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
plt.bar(pr_counts.index, pr_counts.values, color='#00E676')
plt.title('Price Range Distribution', fontsize=20)
plt.xlabel('Price Range', fontsize=18)
plt.ylabel('Number of Products', fontsize=16)
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Re-plot the fine-grained price ranges with one distinct color per bin.
colors = ['#FF5733', "#c7e9b4", '#EE98AA', '#40E0D0', '#9B59B6', '#FFC300', '#00E676']
plt.figure(figsize=(10, 6))
# BUG FIX: previously plotted `pr_count` (the stale 6-bin coarse counts) even
# though the 7-color palette was chosen for the 7 fine bins in `pr_counts`.
bars = plt.bar(pr_counts.index, pr_counts.values, color=colors)
plt.title('Price Range Distribution', fontdict={'fontsize': 20})
plt.xlabel('Price Range', fontdict={'fontsize': 18})
plt.ylabel('Number of Products', fontdict={'fontsize': 16})
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.6)
# Annotate each bar with its count.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval, int(yval), va='bottom', ha='center')
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
Let's check the 'availability status' of books.
# Availability counts.  NOTE(review): the column really is spelled 'availabe'
# in the scraped data; keep the typo here or rename the column upstream first.
av_counts = df['availabe'].value_counts()

# Two-panel figure: absolute counts on the left, percentage share on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

sns.barplot(x=av_counts.index, y=av_counts.values, ax=ax1)
ax1.set_title('Availability of Books (Count)', fontdict={'fontsize': 17})
ax1.set_xlabel('Availability', fontdict={'fontsize': 16})
ax1.set_ylabel('Count', fontdict={'fontsize': 16})
# Put the exact count above each bar.
for patch in ax1.patches:
    ax1.annotate(f'{patch.get_height()}',
                 (patch.get_x() + patch.get_width() / 2., patch.get_height()),
                 ha='center', va='center', xytext=(0, 5),
                 textcoords='offset points')

ax2.pie(av_counts.values, labels=av_counts.index, autopct='%1.1f%%', startangle=140)
ax2.set_title('Availability of Books (Percentage)', fontdict={'fontsize': 17})

plt.tight_layout()
plt.show()
From our pie chart, we can see that almost all books are available as 99.2 % of books are in stock, whereas only 0.8 % are not available.
Let's do some analysis on our Top 10 authors as per their number of books published.
# Number of distinct author values in the dataset.
len(df['author'].unique())
20384
# Number of distinct publisher values in the dataset.
len(df['publisher'].unique())
194
# Per-publisher book counts, sorted by frequency (value_counts default).
df['publisher'].value_counts()
Springer International Publishing 8432
Packt Publishing 3923
Apress 3061
CRC Press 2719
Springer Berlin Heidelberg 2350
...
TeeBooks 1
Stonesong Digital LLC 1
Tsunami Productions 1
NYU Press 1
University of Chicago Press 1
Name: publisher, Length: 194, dtype: int64
# Per-author book counts, sorted by frequency (value_counts default).
df['author'].value_counts()
Guy Hart-Davis 44
Paul McFedries 43
Michael R. Miller 43
Matthew MacDonald 41
Osvaldo Gervasi 38
..
Gregory Kipper 1
Benjamin A. Lieberman 1
Reynolds M. Salerno 1
Jonathan S. Held 1
Jérôme Lang 1
Name: author, Length: 20384, dtype: int64
# Top 10 authors ranked by number of books published.
ar_counts = df['author'].value_counts().head(10)
# BUG FIX: the palette previously held only 9 colors for 10 bars, so the 10th
# bar silently recycled a color; a 10th distinct color is appended.
colors = ['#4090D0', '#C759B4', '#EE98AA', '#D2148C', '#FA5AD2',
          '#FA9BD7', '#D81FD8', '#F4A460', '#F08080', '#8FBC8F']
plt.figure(figsize=(10, 6))
bars = plt.bar(ar_counts.index, ar_counts.values, color=colors)
plt.title('Top 10 Authors by Number of Books Published', fontdict={'fontsize': 16})
plt.xlabel('Author', fontdict={'fontsize': 18})
plt.ylabel('Number of Books Published', fontdict={'fontsize': 14})
plt.xticks(rotation=45, ha='right')
# Annotate each bar with the exact number of books.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval, int(yval), va='bottom', ha='center')
plt.tight_layout()
plt.show()
Now, let's generate a bar plot showing the distribution of editions based on their counts.
# Distinct edition values (strings, plus NaN for books without edition info).
df['edition'].unique()
array(['2', nan, '3', '8', '5', '7', '6', '4', '9', '11', '12', '10',
'22', '14', '13', '19', '18', '57', '15', '31', '16', '21'],
dtype=object)
# Frequency of each edition value.
en_counts = df['edition'].value_counts()
# NOTE(review): only 7 colors for ~22 edition values — matplotlib reuses the
# sequence cyclically; confirm the repeated colors are acceptable.
colors = ['#FF5733', "#c7e9b4", '#EE98AA', '#40E0D0', '#9B59B6', '#FFC300', '#00E676']
plt.figure(figsize=(10, 6))
edition_bars = plt.bar(en_counts.index, en_counts.values, color=colors)
plt.title("Book's Edition Distribution", fontsize=18)
plt.xlabel('Edition', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.xticks(rotation=0, ha='right')
# Annotate each bar with its count.
for rect in edition_bars:
    plt.text(rect.get_x() + rect.get_width() / 2, rect.get_height(),
             int(rect.get_height()), va='bottom', ha='center')
plt.tight_layout()
plt.show()
Let's visualize the eight most frequently used words in the 'description' column.
# Count word frequencies across all processed descriptions.
from collections import Counter

c = Counter()
for txt in df.processed_description.values:
    # whitespace-split each description and fold the tokens into the counter
    c.update(txt.split())
# Show the eight most frequent words with their counts.
c.most_common(8)
[('book', 13426),
('conference', 6090),
('held', 5634),
('international', 5477),
('papers', 5104),
('proceedings', 5007),
('constitutes', 4834),
('data', 4648)]
# All distinct words seen (iterating a Counter yields its keys).
list(c)
['go', 'rapidly', 'become', 'preferred', 'language', 'building', 'web', 'services', 'plenty', 'tutorials', 'available', 'teach', 'syntax', 'developers', 'experience', 'programming', 'languages', 'enough', 'idioms', 'end', 'recreating', 'patterns', 'make', 'sense', 'context', 'practical', 'tidying', 'messy', 'software', 'means', 'breaking', 'code', 'readable', 'using', 'guard', 'clauses', 'helping', 'functions', 'understandable', 'guide', 'author', 'kent', 'beck', 'creator', 'extreme', 'pioneer', 'suggests', 'might', 'apply', 'tidyings', 'recent', 'series', 'breakthroughs', 'deep', 'learning', 'boosted', 'entire', 'field', 'machine', 'even', 'programmers', 'know', 'close', 'nothing', 'technology', 'use', 'simple', 'efficient', 'tools', 'implement', 'programs', 'capable', 'data', 'bestselling', 'book', 'uses', 'concrete', 'examples', 'minimal', 'theory', 'production', 'ready', 'center', 'many', 'challenges', 'system', 'design', 'today', 'difficult', 'issues', 'need', 'figured', 'scalability', 'consistency', 'reliability', 'efficiency', 'maintainability', 'addition', 'overwhelming', 'variety', 'including', 'relational', 'databases', 'nosql', 'datastores', 'stream', 'batch', 'processors', 'message', 'brokers', 'prepare', 'microsoft', 'exam', 'ms', 'help', 'demonstrate', 'real', 'world', 'mastery', 'skills', 'knowledge', 'required', 'deploy', 'manage', 'perform', 'tenant', 'level', 'implementation', 'administration', 'cloud', 'hybrid', 'date', 'mike', 'meyers', 'delivers', 'complete', 'coverage', 'every', 'topic', 'version', 'comptia', 'network', 'certification', 'objectives', 'inside', 'comprehensive', 'resource', 'created', 'edited', 'leading', 'expert', 'training', 'md', 'protect', 'modern', 'endpoints', 'scale', 'environments', 'designed', 'endpoint', 'administrators', 'years', 'companies', 'rewarded', 'effective', 'engineers', 'management', 'positions', 'treating', 'default', 'path', 'engineer', 'leadership', 'ability', 'serve', 'industry', 'staff', 'allows', 
'contribute', 'high', 'role', 'models', 'driving', 'big', 'projects', 'want', 'build', 'pages', 'prior', 'friendly', 'perfect', 'place', 'start', 'begin', 'square', 'work', 'steadily', 'create', 'site', 'multicolumn', 'adapt', 'mobile', 'devices', 'fabric', 'lakehouse', 'mesh', 'recently', 'appeared', 'viable', 'alternatives', 'warehouse', 'new', 'architectures', 'solid', 'benefits', 'surrounded', 'lot', 'hyperbole', 'confusion', 'provides', 'guided', 'tour', 'architecture', 'professionals', 'understand', 'pros', 'engineering', 'grown', 'past', 'decade', 'leaving', 'scientists', 'analysts', 'looking', 'view', 'practice', 'learn', 'plan', 'systems', 'needs', 'organization', 'customers', 'evaluating', 'best', 'technologies', 'adoption', 'serverless', 'rise', 'little', 'guidance', 'development', 'teams', 'aws', 'definitive', 'packed', 'architectural', 'security', 'practices', 'architects', 'reliable', 'enterprise', 'waste', 'time', 'bending', 'python', 'fit', 'learned', 'simplicity', 'lets', 'productive', 'quickly', 'often', 'everything', 'offer', 'updated', 'edition', 'hands', 'write', 'leveraging', 'questions', 'net', 'selling', 'answers', 'unusual', 'flexibility', 'breadth', 'continual', 'growth', 'always', 'tradition', 'reilly', 'nutshell', 'guides', 'thoroughly', 'simply', 'volume', 'reference', 'generative', 'ai', 'hottest', 'tech', 'teaches', 'tensorflow', 'keras', 'impressive', 'scratch', 'variational', 'autoencoders', 'vaes', 'adversarial', 'networks', 'gans', 'transformers', 'normalizing', 'flows', 'energy', 'based', 'harder', 'developer', 'chase', 'changing', 'technological', 'trends', 'business', 'domains', 'behind', 'set', 'core', 'principles', 'analyzing', 'understanding', 'strategy', 'handbook', 'manipulating', 'processing', 'cleaning', 'crunching', 'datasets', 'pandas', 'third', 'case', 'studies', 'show', 'solve', 'broad', 'analysis', 'problems', 'effectively', 'latest', 'versions', 'good', 'essential', 'success', 'project', 'designing', 'hard', 
'consequences', 'decisions', 'overview', 'experienced', 'thorough', 'unparalleled', 'organizations', 'struggle', 'balance', 'requirements', 'increasing', 'volumes', 'additionally', 'demand', 'large', 'growing', 'among', 'competitive', 'digital', 'industries', 'conventional', 'task', 'finops', 'brings', 'financial', 'accountability', 'variable', 'spend', 'model', 'used', 'majority', 'global', 'enterprises', 'fringe', 'activity', 'de', 'facto', 'discipline', 'managing', 'authors', 'storment', 'fuller', 'outline', 'process', 'culture', 'psychology', 'specifically', 'users', 'behave', 'interact', 'interfaces', 'perhaps', 'single', 'valuable', 'nondesign', 'skill', 'designer', 'elegant', 'fail', 'forces', 'conform', 'instead', 'working', 'within', 'blueprint', 'humans', 'perceive', 'comes', 'choosing', 'maintaining', 'database', 'internals', 'distributed', 'offers', 'differ', 'alex', 'petrov', 'concepts', 'foundation', 'computation', 'writing', 'performance', 'sensitive', 'requires', 'puts', 'control', 'memory', 'processor', 'resources', 'rust', 'combines', 'type', 'catches', 'typescript', 'typed', 'superset', 'javascript', 'potential', 'headaches', 'famous', 'curve', 'take', 'specific', 'ways', 'improve', 'dan', 'vanderkam', 'interested', 'material', 'market', 'seventh', 'represents', 'significant', 'update', 'information', 'ecmascript', 'chapters', 'features', 'conquered', 'identified', 'surveys', 'fastest', 'popular', 'widely', 'consumer', 'across', 'frequently', 'credited', 'massive', 'applications', 'exactly', 'application', 'creates', 'whether', 'consists', 'log', 'messages', 'metrics', 'user', 'outgoing', 'moving', 'important', 'kafka', 'streaming', 'platform', 'handle', 'continues', 'evolve', 'central', 'longer', 'scalable', 'turning', 'value', 'paradigm', 'shift', 'way', 'federate', 'responsibilities', 'others', 'first', 'class', 'tool', 'researchers', 'primarily', 'libraries', 'storing', 'gaining', 'insight', 'several', 'exist', 'individual', 'pieces', 
'science', 'stack', 'ipython', 'numpy', 'matplotlib', 'opened', 'artists', 'designers', 'educators', 'beginners', 'short', 'gently', 'introduces', 'computer', 'written', 'co', 'founders', 'reas', 'fry', 'getting', 'started', 'shows', 'easy', 'java', 'usually', 'tackle', 'complexity', 'object', 'oriented', 'oop', 'problem', 'match', 'functional', 'fp', 'another', 'approach', 'solving', 'grasp', 'lambda', 'expressions', 'streams', 'ideal', 'apis', 'fun', 'compelling', 'realistic', 'marc', 'loy', 'patrick', 'niemeyer', 'leuck', 'introduce', 'fundamentals', 'techniques', 'eye', 'parts', 'force', 'choose', 'various', 'compromises', 'think', 'critically', 'trade', 'offs', 'involved', 'veterans', 'practicing', 'grow', 'popularity', 'becoming', 'larger', 'complex', 'taking', 'interest', 'hexagonal', 'clean', 'event', 'driven', 'strategic', 'prescribed', 'domain', 'dd', 'translating', 'renowned', 'experts', 'foster', 'provost', 'tom', 'fawcett', 'fundamental', 'walks', 'analytic', 'thinking', 'necessary', 'extracting', 'useful', 'collect', 'helps', 'greatly', 'expanded', 'introduction', 'randomization', 'probabilistic', 'without', 'slogging', 'manuals', 'head', 'built', 'structures', 'app', 'prime', 'runs', 'still', 'wonder', 'compiler', 'throwing', 'squiggly', 'red', 'lines', 'enter', 'cookbook', 'stefan', 'baumgartner', 'senior', 'solutions', 'everyday', 'quick', 'tightly', 'focused', 'tells', 'long', 'intros', 'bloated', 'samples', 'succinct', 'browse', 'pocket', 'source', 'earlier', 'staring', 'screen', 'idea', 'worry', 'publisher', 'lecturer', 'trainer', 'kevin', 'wilson', 'follow', 'instructions', 'photos', 'illustrations', 'helpful', 'tips', 'video', 'demos', 'extremely', 'suited', 'concurrency', 'ecosystem', 'include', 'lots', 'concurrent', 'locks', 'implementing', 'correctly', 'ordering', 'bugs', 'uncommon', 'monolithic', 'smaller', 'self', 'contained', 'microservices', 'fine', 'grained', 'developing', 'host', 'second', 'takes', 'holistic', 'topics', 'consider', 
'scaling', 'unique', 'method', 'goes', 'beyond', 'programmer', 'puzzles', 'mysteries', 'soul', 'searching', 'interviews', 'excel', 'remains', 'ubiquitous', 'feedback', 'forums', 'full', 'requests', 'scripting', 'fact', 'top', 'feature', 'requested', 'makes', 'combination', 'felix', 'zumstein', 'xlwings', 'open', 'package', 'automating', 'reinvent', 'wheel', 'look', 'lessons', 'faced', 'advantage', 'something', 'challenging', 'detangle', 'migrate', 'microservice', 'usual', 'companion', 'sam', 'newman', 'details', 'proven', 'transitioning', 'existing', 'extensively', 'covers', 'advanced', 'haskell', 'frameworks', 'modules', 'toolkits', 'dive', 'actually', 'algorithms', 'introducing', 'beginning', 'bill', 'lubanovic', 'basics', 'varied', 'mixing', 'style', 'recipes', 'explain', 'chapter', 'exercises', 'sophisticated', 'page', 'styling', 'improved', 'accessibility', 'less', 'effort', 'expended', 'revised', 'fifth', 'css', 'along', 'review', 'specifications', 'eric', 'meyer', 'estelle', 'weyl', 'methods', 'construction', 'following', 'presents', 'philosophy', 'creating', 'handy', 'already', 'textbook', 'supports', 'bcs', 'certificate', 'constantly', 'providing', 'constant', 'paradigms', 'incremental', 'developments', 'foundations', 'rethinking', 'changes', 'seamless', 'experiences', 'exploding', 'number', 'channels', 'screens', 'contexts', 'navigate', 'maze', 'options', 'capturing', 'ui', 'major', 'privacy', 'regulations', 'gdpr', 'ccpa', 'expensive', 'notorious', 'breaches', 'never', 'pressure', 'ensure', 'unfortunately', 'integrating', 'complicated', 'give', 'blocks', 'speed', 'helm', 'preeminent', 'manager', 'kubernetes', 'container', 'orchestration', 'efficiently', 'install', 'running', 'containers', 'maintainers', 'matt', 'butcher', 'farina', 'josh', 'dolitsky', 'fits', 'sets', 'laravel', 'apart', 'php', 'starters', 'rapid', 'framework', 'sites', 'fully', 'minibook', 'olivier', 'caelen', 'marie', 'alice', 'blete', 'cover', 'main', 'gpt', 'chatgpt', 'step', 'rival', 
'compete', 'attention', 'rest', 'api', 'concise', 'rules', 'drawn', 'stick', 'uri', 'guidelines', 'media', 'types', 'master', 'hidden', 'capabilities', 'related', 'crafting', 'blackbelts', 'test', 'packages', 'small', 'pytest', 'powerful', 'testing', 'tests', 'keep', 'maintainable', 'explore', 'superpowers', 'asserts', 'fixtures', 'parametrization', 'markers', 'plugins', 'plans', 'modernize', 'move', 'legacy', 'private', 'premises', 'solution', 'anybody', 'computing', 'migration', 'transformation', 'cybersecurity', 'broken', 'year', 'attackers', 'remain', 'unchallenged', 'undeterred', 'feel', 'operate', 'secure', 'failure', 'prevented', 'mental', 'incomplete', 'evolves', 'verify', 'expect', 'warehouses', 'lakes', 'lands', 'repositories', 'transformed', 'enabling', 'raw', 'defined', 'dbt', 'bi', 'true', 'oxford', 'researcher', 'accessible', 'history', 'future', 'cutting', 'edge', 'misunderstood', 'artificial', 'intelligence', 'somewhat', 'ill', 'term', 'aim', 'machines', 'conscious', 'aware', 'sentient', 'kind', 'acclaimed', 'andrew', 'hoffman', 'three', 'pillars', 'reconnaissance', 'offense', 'defense', 'examines', 'dozens', 'attacks', 'mitigations', 'threat', 'modeling', 'editions', 'rhcsa', 'rhce', 'exams', 'confidence', ...]
# Horizontal bar chart of the eight most frequent description words.
words, counts = zip(*c.most_common(8))
plt.figure(figsize=(10, 6))
plt.barh(list(words), list(counts), color='crimson')
plt.title("The topmost frequent used words", fontdict={'fontsize': 18})
plt.xlabel('Count', fontdict={'fontsize': 16})
plt.ylabel("Most frequent words", fontdict={'fontsize': 16})
plt.show()
Function to create a word cloud of the most frequent words in our 'processed_title' feature.
# Concatenate every processed title into one text blob for the word cloud.
t = ' '.join(word for word in df['processed_title'].astype(str))

def word_cloud(text):
    """Render a word cloud for the given text.

    BUG FIX: the function previously ignored its ``text`` parameter and read
    the global ``t`` instead, and the call site passed the function object
    itself rather than the text.  It now uses its argument.
    """
    cloud = WordCloud(width=1600, height=800, random_state=21,
                      max_font_size=110, collocations=False,
                      min_font_size=18).generate_from_text(text)
    plt.figure(figsize=(15, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

word_cloud(t)
# Description length in characters; missing descriptions stay NaN.
df['Length_of_description'] = df['description'].map(
    lambda v: len(str(v)) if pd.notna(v) else np.nan)
# Histogram of description lengths (missing descriptions dropped first).
plt.figure(figsize=(20, 8))
desc_lengths = df['Length_of_description'].dropna()
plt.hist(desc_lengths, color='lightpink', edgecolor='black')
plt.xlabel('Length of Description')
plt.ylabel('Frequency')
plt.title('Distribution of Description Lengths', fontsize=24)
plt.grid(True)
plt.show()
We notice that:
Most descriptions are around 250 characters long, i.e. the average description length is roughly 250 (note: the computed length is in characters, not words).
This is plausible, as about 250 words is commonly treated as a standard upper bound for a book description.
Pandas Profiling is a tool used for EDA (Exploratory Data Analysis).
It provides comprehensive insights and helps assess data quality.
By using pandas profiling we get a detailed per-variable summary of the dataset.
Let's use ydata-profiling to generate the detailed report, complete with statistics and visualizations.
# Build the ydata-profiling report over the whole dataframe and export it to HTML.
report=pp.ProfileReport(df)
report.to_file('final_version.html')
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Display the interactive profiling report inline in the notebook.
report